library(magrittr)
library(stringr)
library(plyr)
library(tidyverse)
library(readxl)
library(stm)
library(vietnamcode)


# Root directory holding the replication inputs; the figures are written
# here as well. The original location remains the default, but it can be
# overridden by setting the VNA_HOME environment variable so the script runs
# on other machines without being edited.
#
# The former `rm(list=ls())` was removed: it destroyed the caller's workspace
# whenever this script was source()d and still did not give a clean session
# (attached packages and options survive). Restart R for a clean run instead.
home <- Sys.getenv('VNA_HOME', unset='')
if(!nzchar(home)) {
  home <- 'C:/Users/Jason/Dropbox/VNA_Responsiveness/Analysis/JOP-dataverse/'
}
# File names are appended to `home` with paste0(), so it must end in a
# path separator.
if(!grepl('[/\\\\]$', home)) {
  home <- paste0(home, '/')
}


# Outcome tables for the pooled sample and for each forum (caucus, query,
# floor). All four workbooks go through the same reader so they share the
# treatment coding and the row order.
read_outcomes <- function(workbook) {
  outcomes <- read_xlsx(paste0(home, workbook))
  # Fixed level order keeps 'Control' as the first (reference) level; any
  # value outside these three arms becomes NA and is dropped just below.
  outcomes <- mutate(outcomes,
                     Treatment=factor(x=Treatment,
                                      levels=c('Control', 'Citizen', 'Firm')))
  outcomes <- subset(outcomes, !is.na(Treatment))
  arrange(outcomes, Province, Name_VN)
}
dv_pooled <- read_outcomes('pooled-outcomes.xlsx')
dv_caucus <- read_outcomes('caucus-outcomes.xlsx')
dv_query <- read_outcomes('query-outcomes.xlsx')
dv_floor <- read_outcomes('floor-outcomes.xlsx')


# Delegate roster: read the membership list, convert province names from the
# diacritic spelling to the plain spelling used by the outcome tables, and
# attach each delegate's experiment ID by looking the name up in dv_pooled.
df_delegate_raw <- read_csv(paste0(home, '14th-VNA-membership.csv')) %>%
  mutate(Province=mapvalues(x=province,
                            from=vietnamcode_data$province_name_diacritics,
                            to=vietnamcode_data$province_name),
         Province=as.character(Province))
# The province whose raw name contains 'a - V' is recoded by hand to 'BRVT'.
is_brvt <- str_detect(df_delegate_raw$province, 'a - V')
df_delegate_raw$Province[is_brvt] <- 'BRVT'
# The row order matters from here on: the manual corrections below are
# applied positionally to the rows of this sorted roster.
df_delegate_raw <- arrange(df_delegate_raw, Province, name)

# Name lookup. The result per delegate is:
#   NA   - the name matches no entry of dv_pooled$Name_VN,
#   ID   - the name matches exactly one entry,
#   999L - the name matches several entries (sentinel for "ambiguous").
# Note that the name is used as a regular-expression pattern by str_detect().
df_delegate_raw$ID <- llply(.data=df_delegate_raw$name,
                            .fun=function(delegate_name) {
  hits <- which(str_detect(string=dv_pooled$Name_VN, pattern=delegate_name))
  if(length(hits)<1) {
    NA
  } else if(length(hits)==1) {
    dv_pooled$ID[hits]
  } else {
    999L
  }
}, .inform=TRUE) %>% unlist

# Hand-coded IDs for the unmatched / ambiguous delegates, listed in roster
# order. An NA entry leaves that delegate without an ID.
manual_ids <- as.integer(c(59,57,157,257,149,201,233,134,133,455,449,115,262,260,197,120,33,369,215,88,298,174,312,422,418,419,69,393,137,221,109,253,NA,22,404,92,9,234,236,19,331,163,514,302,358,81))
needs_manual <- is.na(df_delegate_raw$ID) | df_delegate_raw$ID==999L
# Guard: a length mismatch would otherwise only raise a warning while R
# recycles or truncates the vector, silently giving delegates wrong IDs.
if(sum(needs_manual) != length(manual_ids)) {
  stop('Manual ID corrections out of sync with the data: ',
       sum(needs_manual), ' delegates need an ID but ',
       length(manual_ids), ' corrections are listed.',
       call.=FALSE)
}
df_delegate_raw$ID[needs_manual] <- manual_ids
# Join the delegate roster to each outcome table. The join keeps every
# roster row (all.x), then rows that never received an ID are removed.
# Province and Dosage are dropped from the outcome side before merging;
# the roster supplies its own Province column.
merge_delegate_outcomes <- function(outcomes) {
  merged <- merge(x=df_delegate_raw,
                  y=subset(outcomes, select=-c(Province, Dosage)),
                  by='ID',
                  all.x=TRUE)
  subset(merged, !is.na(ID))
}
df_delegate_raw_pooled <- merge_delegate_outcomes(dv_pooled)
df_delegate_raw_caucus <- merge_delegate_outcomes(dv_caucus)
df_delegate_raw_query <- merge_delegate_outcomes(dv_query)
df_delegate_raw_floor <- merge_delegate_outcomes(dv_floor)


# Keyword-coded transcripts, one sheet per forum. Each sheet is read,
# its 'Original' column renamed to 'ID' and its forum-specific flag column
# renamed to 'Unusual', rows without an ID removed, sheet-level descriptive
# columns dropped, and the delegate covariates merged in by ID (inner join).
read_coded_sheet <- function(sheet, flag_column, delegates) {
  coded <- read_xlsx(paste0(home, 'keyword-coded-transcripts.xlsx'),
                     sheet=sheet)
  colnames(coded) <- mapvalues(x=colnames(coded),
                               from=c('Original', flag_column),
                               to=c('ID', 'Unusual'))
  coded <- subset(coded,
                  !is.na(ID),
                  select=-c(Province,
                            Name,
                            Articles,
                            EducationLevel,
                            Critical:Business))
  covariates <- subset(delegates,
                       select=c(ID,
                                Province,
                                Treatment,
                                Citizen,
                                Firm,
                                FullTime,
                                CentNom,
                                Competitive))
  merge(x=coded, y=covariates, by='ID')
}
# The flag column carries a different header on every sheet.
coded.query <- read_coded_sheet(sheet='Query',
                                flag_column='Answering',
                                delegates=df_delegate_raw_query)
coded.floor <- read_coded_sheet(sheet='Floor',
                                flag_column='Answer',
                                delegates=df_delegate_raw_floor)
coded.caucus <- read_coded_sheet(sheet='Caucus',
                                 flag_column='Moderator',
                                 delegates=df_delegate_raw_caucus)
# Vocabulary: the sorted set of distinct keywords per forum, and the sorted
# union across forums. A Keywords cell is split on ', ' and on ' - ';
# missing cells contribute nothing.
collect_vocab <- function(keywords) {
  tokens <- unlist(str_split(string=keywords, pattern=', | - '))
  sort(unique(na.omit(tokens)))
}
vocab.query <- collect_vocab(coded.query$Keywords)
vocab.floor <- collect_vocab(coded.floor$Keywords)
vocab.caucus <- collect_vocab(coded.caucus$Keywords)
vocab <- sort(unique(c(vocab.query, vocab.floor, vocab.caucus)))
# Long token tables: one row per (transcript row, keyword). For every row of
# a coded sheet the Keywords cell is split into tokens and each token is
# paired with its position in the shared vocabulary `vocab`. adply() carries
# the columns of the originating row along; rows without keywords yield
# nothing.
tokenize_forum <- function(coded, forum) {
  adply(.data=coded,
        .margins=1,
        .fun=function(transcript_row) {
    tokens <- na.omit(unlist(str_split(string=transcript_row$Keywords,
                                       pattern=', | - ')))
    if(length(tokens)==0) {
      return(NULL)
    }
    data.frame(Forum=forum,
               Token=tokens,
               Index=match(x=tokens, table=vocab),
               stringsAsFactors=FALSE)
  }, .progress='text', .inform=TRUE)
}
texts.caucus <- tokenize_forum(coded.caucus, 'Caucus')
texts.floor <- tokenize_forum(coded.floor, 'Floor')
texts.query <- tokenize_forum(coded.query, 'Query')

# Stack the three forums, make Forum a factor with 'Caucus' as the first
# level, and drop the raw keyword string now that it has been tokenised.
texts <- rbind(texts.caucus, texts.floor, texts.query)
texts <- mutate(texts,
                Forum=factor(x=Forum,
                             levels=c('Caucus', 'Query', 'Floor')))
texts <- subset(texts, select=-Keywords)


# Build the stm inputs: one document per delegate ID plus one metadata row
# per delegate ID.
#
# Each document is a 2-row integer matrix: row 1 holds vocabulary indices,
# row 2 the number of times that index occurs for the delegate.
# NOTE(review): tokens are pooled per ID across all forums, while the
# metadata row (including Forum) is taken from the delegate's first row in
# `texts` -- confirm this pooling is intended.
documents <- dlply(.data=texts,
                   .variables='ID',
                   .fun=function(delegate_tokens) {
  term_counts <- as.matrix(plyr::count(delegate_tokens$Index))
  colnames(term_counts) <- NULL
  t(term_counts)
}, .progress='text', .inform=TRUE)

# prepDocuments()/stm() pair documents and metadata rows by position.
# dlply() returns its pieces ordered by ascending ID, whereas distinct()
# keeps IDs in order of first appearance in `texts` (all caucus IDs, then
# floor-only IDs, then query-only IDs). Sorting the metadata by ID keeps the
# same row per delegate but lines it up with `documents`; without it,
# covariates could be attached to the wrong delegate's document.
doc_meta <- subset(texts, select=-c(Token, Index, Unusual)) %>%
  distinct(ID, .keep_all=TRUE) %>%
  arrange(ID)

# Thresholds 0 / Inf keep every vocabulary term.
prepped <- prepDocuments(documents=documents,
                         vocab=vocab,
                         lower.thresh=0L,
                         upper.thresh=Inf,
                         verbose=TRUE,
                         meta=doc_meta)
# Two-topic structural topic model; topic prevalence depends on the forum,
# the two treatment indicators and three delegate covariates. Spectral
# initialisation with a fixed seed.
modeled <- stm(documents=prepped$documents,
               vocab=prepped$vocab,
               K=2,
               prevalence=~Forum + Citizen + Firm + FullTime + CentNom + Competitive,
               data=prepped$meta,
               init.type='Spectral',
               seed=31415)

# Regress topic proportions on the same covariates, with 100 simulation
# draws under global uncertainty.
estimated <- estimateEffect(formula=~Forum + Citizen + Firm + FullTime + CentNom + Competitive,
                            stmobj=modeled,
                            metadata=prepped$meta,
                            uncertainty='Global',
                            nsims=100)

# Ten label words per topic; the FREX words are laid out on two lines of
# five words each, giving one label string per topic.
labeled <- labelTopics(model=modeled, n=10)
labels <- apply(labeled$frex, 1, function(terms) {
  first_line <- paste(terms[1:5], collapse=', ')
  second_line <- paste(terms[6:10], collapse=', ')
  paste(first_line, second_line, sep='\n')
})
# Summarise the simulated coefficients: for each topic, stack the `est`
# vector of every simulation draw and take the 2.5%, 50% and 97.5% quantiles
# per coefficient. The per-topic tables are collected in a list and bound
# together once, topic 1 first.
topic_effects <- vector('list', 2)
for(k in 1:2) {
  draws <- ldply(.data=estimated$parameters[[k]],
                 .fun=function(sim) sim$est)
  bounds <- t(apply(draws,
                    MARGIN=2,
                    FUN=quantile,
                    c(0.025, 0.5, 0.975)))
  colnames(bounds) <- c('low_95',
                        'median',
                        'upp_95')
  topic_effects[[k]] <- mutate(as.data.frame(bounds),
                               Topic=k,
                               Label=mapvalues(x=k,
                                               from=1:2,
                                               to=labels,
                                               warn_missing=FALSE),
                               Coefficient=names(estimated$parameters[[k]][[1]]$est))
}
effects <- do.call(rbind, topic_effects)

# Reader-friendly names for the intercept and the forum dummies.
effects$Coefficient <- mapvalues(x=effects$Coefficient,
                                 from=c('(Intercept)',
                                        'ForumQuery',
                                        'ForumFloor'),
                                 to=c('Constant',
                                      'Query',
                                      'Floor'))

# Hand-written English keyword lists and facet titles, looked up by topic
# number (position 1 = topic 1, position 2 = topic 2).
topic_english <- c('school board;\nuniversity area;\ntraining time;\ndecision-making authority of the school;\nranking of university educational units;\nappointments and hiring;\ncost of training services;\ntraining associations;\nexplain the expression;\nnational university',
                   'broadening education;\npublic school;\npedagogy;\nmanagement;\ntarget enrollment;\ninternational integration;\nno school fees;\nquality of training;\nregular education')
topic_title <- c('Topic 1: national educational system',
                 'Topic 2: school-level reforms')
effects$English <- topic_english[effects$Topic]
effects$Title <- topic_title[effects$Topic]


# Figure 7: median and 95% interval of the Citizen and Firm coefficients,
# one panel per topic, with a dashed red reference line at zero.
g <- ggplot(data=subset(effects, Coefficient %in% c('Citizen','Firm')),
            mapping=aes(x=Coefficient,
                        y=median,
                        ymin=low_95,
                        ymax=upp_95)) +
  geom_hline(yintercept=0, color='red', linetype=2) +
  geom_linerange(size=1) +
  geom_point(size=3) +
  facet_wrap(~Title, nrow=1) +
  coord_fixed(ratio=4) +
  labs(x=NULL, y='Estimate') +
  theme_minimal() +
  theme(panel.border=element_rect(fill=NA, color='black'),
        strip.text=element_text(size=10, hjust=0))
g

# Export the figure into `home` as PNG, TIFF and EPS; the EPS file is
# rendered through the cairo_ps device.
ggsave(filename='figure-07.png',
       plot=g,
       path=home,
       width=4.5,
       height=3,
       units='in')
ggsave(filename='figure-07.tiff',
       plot=g,
       path=home,
       width=4.5,
       height=3,
       units='in')
ggsave(filename='figure-07.eps',
       plot=g,
       device=cairo_ps,
       path=home,
       width=4.5,
       height=3,
       units='in')
